from elasticsearch import Elasticsearch
from elasticsearch import helpers

HOST = 'localhost'


def create_doc(d):
    """Build the document dict that gets indexed for one input record.

    ``d`` is an indexable record: ``d[2]`` is split by ``tokenize`` and the
    pieces are re-joined with single spaces under ``'feature'``; ``d[0]`` is
    stored unchanged under ``'img_name'``.
    """
    tokens = tokenize(d[2])
    return {
        'feature': ' '.join(tokens),
        'img_name': d[0],
    }


def tokenize(b64_str, str_len=4):
    """Split ``b64_str`` into consecutive chunks of ``str_len`` characters.

    Only whole chunks are returned: any trailing remainder shorter than
    ``str_len`` is dropped, so an input shorter than ``str_len`` yields an
    empty list.
    """
    chunk_count = int(len(b64_str) / str_len)
    return [
        b64_str[int(k * str_len): int((k + 1) * str_len)]
        for k in range(chunk_count)
    ]


if __name__ == '__main__':
    # Script entry point: read a tab-separated input file and bulk-index one
    # document per valid line into the 'img_test3' index.
    import sys

    # Number of actions accumulated before each bulk request is sent.
    BATCH_SIZE = 1000

    if len(sys.argv) < 2:
        # Fail with a readable message instead of a bare IndexError.
        sys.exit('usage: %s <input_file>' % sys.argv[0])
    input_file = sys.argv[1]

    es = Elasticsearch(hosts='%s:9200' % HOST)

    actions = []
    # The context manager guarantees the input file is closed, even when a
    # bulk request raises part-way through the file.
    with open(input_file) as fd:
        for line in fd:
            d = line.strip().split('\t')
            # Only lines with exactly three tab-separated fields are indexed;
            # create_doc reads fields 0 and 2.
            if len(d) != 3:
                continue

            doc = create_doc(d)
            print(doc)
            actions.append({
                "_index": 'img_test3',
                "_type": 'fulltext',
                "_source": doc,
                # The image name doubles as the document id.
                "_id": doc['img_name'],
            })

            # The batch is emptied after every flush, so its length reaching
            # BATCH_SIZE is equivalent to "every 1000th accepted line".
            if len(actions) == BATCH_SIZE:
                helpers.bulk(es, actions)
                del actions[:]

    # Send whatever remains from the last, partially filled batch.
    helpers.bulk(es, actions)
